1 Preprocessing Code

Note: Code chunks are collated and echoed at the end of the document in Appendix I.

1.1 Import data into R and generate a Digital Gene Expression List

Generate a digital gene expression list that could be easily shared/loaded for downstream filtering/normalization.

# Datasets to process: one DGEList is built per library-prep type.
species_list <- tibble(species = c('polya_embryonic', 'wholeRNA_embryonic'))

for (x in species_list$species) {
  # read in the study design ----
  # One row per sample; must provide Biological_ID, sample, and group columns.
  targets <- read_tsv(paste0("./Data/cele_embryonic/", x, "_study_design.txt"),
                      na = c("", "NA", "na"), show_col_types = FALSE)

  # load pre-generated annotation information
  # load() restores an object named `annotations` (used below). The file does
  # not depend on `x`, so this load is repeated only to keep the chunk
  # self-contained.
  load("./Outputs/elegans_geneAnnotations")

  # import featureCount output into R ----
  # One count file per sample, named by Biological_ID.
  path <- paste0("./Data/cele_embryonic/featureCount.C_elegans.",
                 targets$Biological_ID, ".CDS.unique_only.ws290.txt")

  # rbindlist() stacks the per-sample tables; idcol = "sample" records the
  # integer index into `path`, which is mapped back to the sample name
  # (targets$sample and path share the same row order).
  featureCountData <- rbindlist(lapply(path, fread), idcol = "sample") %>%
    mutate(sample = targets$sample[sample])
  colnames(featureCountData) <- c('sample', 'geneID', 'Ce_ortholog', 'stableID',
                                  "location", "length", "count")

  # One row per gene, one count column per sample.
  featureCountData_wider <- featureCountData %>%
    dplyr::select(!c(Ce_ortholog, location, length)) %>%
    pivot_wider(names_from = sample, values_from = count)

  # Count matrix with geneID rownames, as required by DGEList().
  counts <- featureCountData_wider %>%
    dplyr::select(-stableID) %>%
    column_to_rownames(var = "geneID")

  # Annotation rows aligned to the count-matrix row order.
  annotations_sub <- dplyr::select(featureCountData_wider, geneID) %>%
    left_join(annotations, by = "geneID")

  # generate a DGEList (edgeR container bundling counts, samples, and genes)
  myDGEList <- DGEList(counts,
                       samples = targets$sample,
                       group = targets$group,
                       genes = annotations_sub)

  # Save one DGEList per dataset for downstream filtering/normalization.
  # `output.path` is defined elsewhere in the document.
  output.name <- paste0(x, '_DGEList')
  save(myDGEList,
       file = file.path(output.path,
                        output.name))
}

1.2 Data Filtering, Normalization, and Variance Stabilization

The goal of this chunk is to:

  1. Filter and normalize data.
  2. Use ggplot2 to visualize the impact of filtering and normalization on the data (see Output section, below).
  3. Uses a DGEList of filtered and normalized abundance data, fitting the data to a linear model for responsively detecting differentially expressed genes (DEGs).
  4. Save the following files and datasets:
    i) filtered and normalized (but not voom adjusted) log2CPM values.
    ii) a matrix of discarded genes and their raw counts - this data is downloadable from within the Shiny Browser App.
    iii) the variance-stabilized vDGEList, saved as an R object (_vDGEList).
    iv) a matrix of variance-stabilized gene expression data, extracted from the vDGEList (_log2cpm_filtered_norm_voom.csv) - this data is downloadable from within the Browser App.
# Datasets to process (names must match those used when the DGELists were saved).
species_list <- tibble(species = c('polya_embryonic', 'wholeRNA_embryonic'))

for (x in species_list$species) {
# load pre-generated DGEList information
# load() restores the object `myDGEList` saved in section 1.1.
load(paste0("./Outputs/",x,"_DGEList"))

# load pre-generated annotation information
# NOTE(review): section 1.1 loads "elegans_geneAnnotations" but this chunk
# loads "cele_embryonic_geneAnnotations" -- confirm both files exist and that
# the intended `annotations` object is restored here.
load(paste0("./Outputs/cele_embryonic_geneAnnotations"))  
  
# read in the study design ----
targets <- read_tsv(paste0("./Data/cele_embryonic/", x,"_study_design.txt"),
                    na = c("", "NA", "na"), show_col_types = F)
# Generate life stage IDs
# One group label per (gene x sample) row of the long-format table built
# below: pivot_longer() emits samples in targets order within each gene, so
# repeating the whole group vector once per gene lines up row-for-row.
# (cbind() wraps the vector in a one-column matrix; rep() drops that again.)
ids <- rep(cbind(targets$group), 
           times = nrow(myDGEList$counts)) %>%
  as_factor()

# calculate and plot log2 counts per million ----
# use the 'cpm' function from EdgeR to get log2 counts per million
# then coerce into a tibble
log2.cpm.df.pivot <- cpm(myDGEList, log=TRUE) %>%
  as_tibble(rownames = "geneID") %>%
  setNames(nm = c("geneID", targets$sample)) %>%  # relies on column order matching targets row order
  pivot_longer(cols = -geneID, 
               names_to = "samples", 
               values_to = "expression") %>% 
  add_column(life_stage = ids)

# plot the data
# Violin plot of per-sample log2 CPM distributions (unfiltered,
# non-normalized); the black point marks each sample's median.
p1 <- ggplot(log2.cpm.df.pivot) +
  aes(x=samples, y=expression, fill=life_stage) +
  geom_violin(trim = FALSE, show.legend = T, alpha= 0.7) +
  stat_summary(fun = "median", 
               geom = "point", 
               shape = 20, 
               size = 2, 
               color = "black", 
               show.legend = FALSE) +
  labs(y="log2 expression", x = "sample",
       title = paste0("C. ", x, ": Log2 Counts per Million (CPM)"),
       subtitle="unfiltered, non-normalized",
       caption=paste0("produced on ", Sys.time())) +
  theme_bw() +
  coord_flip()
# Filter the data ----

# Keep genes/transcripts with > 1 CPM in at least 1 sample.
# BUG FIX: the original `cpm(myDGEList) %>% rowSums(. > 1) >= 1` piped the
# CPM matrix into rowSums() as `x` (magrittr still inserts the LHS because
# `.` only appears inside an argument expression) and passed the logical
# matrix `. > 1` as rowSums()'s second argument (na.rm). It therefore summed
# raw CPM values per gene instead of counting samples above the cutoff.
# Note: the cutoff (here: at least 1 sample) should be adjusted to the number
# of samples in the smallest group of comparison.
keepers <- rowSums(cpm(myDGEList) > 1) >= 1

# Subset the DGEList to the retained genes.
# (edgeR also offers `keep.lib.sizes = FALSE` to recompute library sizes
# after filtering; left unchanged here to preserve downstream behavior.)
myDGEList.filtered <- myDGEList[keepers,]

# Group label per (gene x sample) row of the filtered long-format table.
ids.filtered <- rep(cbind(targets$group), 
                    times = nrow(myDGEList.filtered)) %>%
  as_factor()

# Recompute log2 CPM on the filtered genes and pivot to long format
# (same construction as the unfiltered table above).
log2.cpm.filtered.df.pivot <- cpm(myDGEList.filtered, log=TRUE) %>%
  as_tibble(rownames = "geneID") %>%
  setNames(nm = c("geneID", targets$sample)) %>%
  pivot_longer(cols = -geneID,
               names_to = "samples",
               values_to = "expression") %>%
  add_column(life_stage = ids.filtered)

# Violin plot of per-sample log2 CPM after low-count filtering
# (still non-normalized); black point = per-sample median.
p2 <- ggplot(log2.cpm.filtered.df.pivot) +
  aes(x=samples, y=expression, fill=life_stage) +
  geom_violin(trim = FALSE, show.legend = T, alpha= 0.7) +
  stat_summary(fun = "median", 
               geom = "point", 
               shape = 20, 
               size = 2, 
               color = "black", 
               show.legend = FALSE) +
  labs(y="log2 expression", x = "sample",
       title = paste0("C. ", x, ": Log2 Counts per Million (CPM)"),
       subtitle="filtered, non-normalized",
       caption=paste0("produced on ", Sys.time())) +
  theme_bw() +
  coord_flip()

# Look at the genes excluded by the filtering step ----
# just to check that there aren't any with 
# high expression that are in few samples
# Discarded genes
myDGEList.discarded <- myDGEList[!keepers,]

# Group label per (gene x sample) row of the discarded long-format table.
ids.discarded <- rep(cbind(targets$group), 
                     times = nrow(myDGEList.discarded)) %>%
  as_factor()

# Linear-scale (NOT log-transformed) CPM for the discarded genes, in long
# format. The `log2.` prefix is kept only because downstream code refers to
# this name; the values are plain CPM (log = FALSE).
# (Idiom fix: spelled out FALSE instead of the reassignable shorthand F.)
log2.cpm.discarded.df.pivot <- cpm(myDGEList.discarded, log = FALSE) %>%
  as_tibble(rownames = "geneID") %>%
  setNames(nm = c("geneID", targets$sample)) %>%
  pivot_longer(cols = -geneID,
               names_to = "samples",
               values_to = "expression") %>%
  add_column(life_stage = ids.discarded)

# Genes that are above 1 cpm; the result is deliberately not assigned so it
# prints in the knitted report.
log2.cpm.discarded.df.pivot %>%
  dplyr::filter(expression > 1)

# Generate a matrix of discarded genes and their raw counts ----
# (values are linear-scale CPM from the previous step, not raw counts)
# Wide format: one row per gene, one "<life_stage>-<sample>" column per
# sample, joined to the gene annotation table.
discarded.gene.df <- log2.cpm.discarded.df.pivot %>%
  pivot_wider(names_from = c(life_stage, samples), 
              names_sep = "-", 
              values_from = expression, 
              id_cols = geneID)%>%
  left_join(annotations, by = "geneID") 

# Save a matrix of discarded genes and their raw counts ----
# (downloadable from within the Shiny Browser App; `output.path` is defined
# elsewhere in the document)
output.name <- paste0(x, '_discardedGenes.csv')

discarded.gene.df %>%    
write.csv(file = file.path(output.path,
                           output.name))

# Plot discarded genes
# Jittered per-sample CPM values for the excluded genes, to confirm no
# highly expressed gene was dropped by the low-count filter.
p.discarded <- ggplot(log2.cpm.discarded.df.pivot) +
  aes(x=samples, y=expression, color=life_stage) +
  geom_jitter(alpha = 0.3, show.legend = T)+
  stat_summary(fun = "median", 
               geom = "point", 
               shape = 20, 
               size = 2, 
               color = "black", 
               show.legend = FALSE) +
  labs(y="expression", x = "sample",
       title = paste0("C. ", x, ": Counts per Million (CPM)"),
       subtitle="genes excluded by low count filtering step, non-normalized",
       caption=paste0("produced on ", Sys.time())) +
  theme_bw() +
  coord_flip()
  
# Normalize the data using a between samples normalization ----
# calcNormFactors() stores per-sample TMM scaling factors in the DGEList;
# the counts themselves are unchanged until cpm() applies the factors.
# Source for TMM sample normalization here:
# https://genomebiology.biomedcentral.com/articles/10.1186/gb-2010-11-3-r25
myDGEList.filtered.norm <- calcNormFactors(myDGEList.filtered, method = "TMM")

# Matrix of filtered, TMM-normalized log2 CPM (saved below).
log2.cpm.filtered.norm <- cpm(myDGEList.filtered.norm, log=TRUE)

# Same values as a tibble with sample-named columns.
log2.cpm.filtered.norm.df<- cpm(myDGEList.filtered.norm, log=TRUE) %>%
  as_tibble(rownames = "geneID") %>%
  setNames(nm = c("geneID", targets$sample))

# Long format for plotting.
log2.cpm.filtered.norm.df.pivot<-log2.cpm.filtered.norm.df %>%
  pivot_longer(cols = -geneID,
               names_to = "samples",
               values_to = "expression") %>%
  add_column(life_stage = ids.filtered)

# Violin plot of per-sample log2 CPM, filtered and TMM-normalized.
p3 <- ggplot(log2.cpm.filtered.norm.df.pivot) +
  aes(x=samples, y=expression, fill=life_stage) +
  geom_violin(trim = FALSE, show.legend = T, alpha = 0.7) +
  stat_summary(fun = "median", 
               geom = "point", 
               shape = 20, 
               size = 2, 
               color = "black", 
               show.legend = FALSE) +
  labs(y="log2 expression", x = "sample",
       title = paste0("C. ", x, ": Log2 Counts per Million (CPM)"),
       subtitle="filtered, TMM normalized",
       caption=paste0("produced on ", Sys.time())) +
  theme_bw() +
  coord_flip()

# Save the before/after filtering-and-normalization plots plus the
# discarded-gene table.
output.name <- paste0(x, '_FilteringNormalizationGraphs')
save(p1, p2, p3, p.discarded, discarded.gene.df,
     file = file.path(output.path,
                      output.name))

# Save the filtered, TMM-normalized DGEList for downstream analyses.
output.name <- paste0(x, '_DGEList_filtered_normalized')
save(myDGEList.filtered.norm,
     file = file.path(output.path,
                      output.name))

# Compute Variance-Stabilized DGEList Object ----

# Set up the design matrix ----
# no intercept/blocking for matrix, comparisons across group
group <- factor(targets$group)
design <- model.matrix(~0 + group) 
colnames(design) <- levels(group)

# NOTE: To handle a 'blocking design' or a batch effect, use:
# design <- model.matrix(~block + treatment)

# Model mean-variance trend and fit linear model to data ----
# Use VOOM function from Limma package to model the mean-variance relationship
# produces a variance-stabilized DGEList, that includes precision 
# weights for each gene to try and control for heteroscedasticity.
# transforms count data to log2-counts per million
# Outputs: E = normalized expression values on the log2 scale
v.DGEList.filtered.norm <- voom(counts = myDGEList.filtered.norm, 
                                design = design, plot = T)
# Label the EList columns by sample, and the expression matrix columns by
# "<group>-<sample>" so the exported CSV is self-describing.
colnames(v.DGEList.filtered.norm)<-targets$sample
colnames(v.DGEList.filtered.norm$E) <- paste(targets$group, 
                                             targets$sample,sep = '-')


# Save matrix of genes and their filtered, normalized, voom-transformed counts ----
# This is the count data that underlies the differential expression analyses in the Shiny app. 
# Saving it here so that users of the app can access the input information.
output.name <- paste0(x, '_log2cpm_filtered_norm_voom.csv')
# Save in Shiny app download (www) folder
# `www.path` is defined elsewhere in the document.
write.csv(v.DGEList.filtered.norm$E, 
          file = file.path(www.path,
                           output.name))
# Save v.DGEList ----
# Save in Shiny app Data folder
# `app.path` is defined elsewhere in the document.
output.name <- paste0(x, '_vDGEList')
save(v.DGEList.filtered.norm,
     file = file.path(app.path,
                      output.name))

}

1.3 C. elegans PolyA+ Early Embryo Detailed Timeline Filtering Results

1.3.1 Unfiltered, non-normalized log2CPM data

1.3.2 Filtered, non-normalized log2CPM data

1.3.3 Filtered, normalized log2CPM data by life stage

1.3.4 Genes discarded by low-copy filtering step

1.3.5 Discarded gene list

1.4 C. elegans Whole RNA Early Embryo Filtering Results

1.4.1 Unfiltered, non-normalized log2CPM data

1.4.2 Filtered, non-normalized log2CPM data

1.4.3 Filtered, normalized log2CPM data by life stage

1.4.4 Genes discarded by low-copy filtering step

1.4.5 Discarded gene list

2 Analysis of Whole RNA and PolyA+ Early Embryo Data

The goal of this section is to see if these two data sets can be combined for a joint analysis.

2.1 Import data into R and generate a Digital Gene Expression List, Data Filtering, Normalization, and Variance Stabilization

# read in the study design for the combined (polyA+ and whole RNA) dataset ----
# Same import pipeline as section 1.1, applied once to the merged design.
targets <- read_tsv("./Data/cele_embryonic/embryonic_study_design.txt",
                    na = c("", "NA", "na"), show_col_types = FALSE)

# load pre-generated annotation information (restores `annotations`)
load("./Outputs/elegans_geneAnnotations")

# import featureCount output into R ----
# One count file per sample, named by Biological_ID.
path <- paste0("./Data/cele_embryonic/featureCount.C_elegans.",
               targets$Biological_ID, ".CDS.unique_only.ws290.txt")

# idcol = "sample" records the integer index into `path`, mapped back to the
# sample name (targets$sample and path share the same row order).
featureCountData <- rbindlist(lapply(path, fread), idcol = "sample") %>%
  mutate(sample = targets$sample[sample])
colnames(featureCountData) <- c('sample', 'geneID', 'Ce_ortholog', 'stableID',
                                "location", "length", "count")

# One row per gene, one count column per sample.
featureCountData_wider <- featureCountData %>%
  dplyr::select(!c(Ce_ortholog, location, length)) %>%
  pivot_wider(names_from = sample, values_from = count)

# Count matrix with geneID rownames, as required by DGEList().
counts <- featureCountData_wider %>%
  dplyr::select(-stableID) %>%
  column_to_rownames(var = "geneID")

# Annotation rows aligned to the count-matrix row order.
annotations_sub <- dplyr::select(featureCountData_wider, geneID) %>%
  left_join(annotations, by = "geneID")

# generate a DGEList
myDGEList <- DGEList(counts,
                     samples = targets$sample,
                     group = targets$group,
                     genes = annotations_sub)

# Generate life stage IDs
# One group label per (gene x sample) row of the long table built below.
# (cbind() wraps the vector in a one-column matrix; rep() drops that again.)
ids <- rep(cbind(targets$group), 
           times = nrow(myDGEList$counts)) %>%
  as_factor()

# calculate log2 counts per million ----
# use the 'cpm' function from EdgeR to get log2 counts per million
# then coerce into a tibble
log2.cpm.df.pivot <- cpm(myDGEList, log=TRUE) %>%
  as_tibble(rownames = "geneID") %>%
  setNames(nm = c("geneID", targets$sample)) %>%  # relies on column order matching targets row order
  pivot_longer(cols = -geneID, 
               names_to = "samples", 
               values_to = "expression") %>% 
  add_column(life_stage = ids)


# Filter the data ----

# Keep genes/transcripts with > 1 CPM in at least 1 sample.
# BUG FIX: `cpm(myDGEList) %>% rowSums(. > 1) >= 1` piped the CPM matrix in
# as rowSums()'s `x` and passed the logical matrix `. > 1` as its second
# (na.rm) argument, summing raw CPM values per gene instead of counting
# samples above the cutoff.
# Note: the cutoff (here: at least 1 sample) should be adjusted to the number
# of samples in the smallest group of comparison.
keepers <- rowSums(cpm(myDGEList) > 1) >= 1

myDGEList.filtered <- myDGEList[keepers,]

# Group label per (gene x sample) row of the filtered long-format table.
ids.filtered <- rep(cbind(targets$group), 
                    times = nrow(myDGEList.filtered)) %>%
  as_factor()

# Normalize the data using a between samples normalization ----
# TMM scaling factors are stored in the DGEList; counts are unchanged until
# cpm()/voom() apply them.
# Source for TMM sample normalization here:
# https://genomebiology.biomedcentral.com/articles/10.1186/gb-2010-11-3-r25
myDGEList.filtered.norm <- calcNormFactors(myDGEList.filtered, method = "TMM")

# Matrix of filtered, TMM-normalized log2 CPM (used by clustering/PCA below).
log2.cpm.filtered.norm <- cpm(myDGEList.filtered.norm, log=TRUE)

# Same values as a tibble with sample-named columns.
log2.cpm.filtered.norm.df<- cpm(myDGEList.filtered.norm, log=TRUE) %>%
  as_tibble(rownames = "geneID") %>%
  setNames(nm = c("geneID", targets$sample))

# Compute Variance-Stabilized DGEList Object ----

# Set up the design matrix ----
# `group` here is the developmental stage and `block` the experiment/batch
# (polyA+ vs whole RNA), modeled additively so the batch effect is absorbed.
group <- factor(targets$stage)
block <- factor(targets$block)

# NOTE: For no intercept/blocking for matrix, comparisons across group:
#design <- model.matrix(~0 + group) 
#colnames(design) <- levels(group)

# NOTE: To handle a 'blocking design' or a batch effect, use:
design <- model.matrix(~block + group)

# Model mean-variance trend and fit linear model to data ----
# Use VOOM function from Limma package to model the mean-variance relationship
# produces a variance-stabilized DGEList, that includes precision 
# weights for each gene to try and control for heteroscedasticity.
# transforms count data to log2-counts per million
# Outputs: E = normalized expression values on the log2 scale
v.DGEList.filtered.norm <- voom(counts = myDGEList.filtered.norm, 
                                design = design, plot = T)

# Label expression-matrix columns "<block>-<group>-<sample>" for exports.
# NOTE(review): labels use targets$group while the design above uses
# targets$stage -- confirm which column is intended.
colnames(v.DGEList.filtered.norm)<-targets$sample
colnames(v.DGEList.filtered.norm$E) <- paste(targets$block,targets$group, 
                                             targets$sample,sep = '-')

2.2 Hierarchical Clustering and Principal Components Analysis

This code chunk starts with filtered and normalized abundance data in a data frame (not tidy). It implements hierarchical clustering and PCA analyses on the data. It plots various graphs, including a dendrogram of the hierarchical clustering, and several plots to visualize the PCA.


# Identify variables of interest in study design file ----
group <- factor(targets$stage)

# Hierarchical clustering ---------------
# Remember: hierarchical clustering can only work on a data matrix, not a data frame

# Calculate distance matrix
# dist calculates distance between rows, so transpose data so that we get distance between samples.
# how similar are samples from each other
# NOTE: sample columns are relabeled with their (non-unique) group names so
# the dendrogram leaves read as life stages rather than sample IDs.
colnames(log2.cpm.filtered.norm)<- targets$group
distance <- dist(t(log2.cpm.filtered.norm), method = "maximum") #other distance methods are "euclidean", "maximum", "manhattan", "canberra", "binary" or "minkowski"

# Calculate clusters to visualize differences. This is the hierarchical clustering.
# The methods here include: single (i.e. "friends-of-friends"), complete (i.e. complete linkage), and average (i.e. UPGMA). Here's a comparison of different types: https://en.wikipedia.org/wiki/UPGMA#Comparison_with_other_linkages
clusters <- hclust(distance, method = "complete") #other agglomeration methods are "ward.D", "ward.D2", "single", "complete", "average", "mcquitty", "median", or "centroid"
dend <- as.dendrogram(clusters) 

# Style the dendrogram with dendextend (5 colored clusters, small labels,
# thin branches), convert to a ggplot, and theme it.
p1<-dend %>% 
  dendextend::set("branches_k_color", k = 5) %>% 
  dendextend::set("hang_leaves", c(0.1)) %>% 
  dendextend::set("labels_cex", c(0.5)) %>%
  dendextend::set("labels_colors", k = 5) %>% 
  dendextend::set("branches_lwd", c(0.7)) %>% 
  as.ggdend %>%
  ggplot (offset_labels = -0.5) +
  theme_dendro() +
  ylim(-8, max(get_branches_heights(dend))) +  # negative floor leaves room for hanging labels
  labs(title = "Hierarchical Cluster Dendrogram ",
       subtitle = "filtered, TMM normalized",
       y = "Distance",
       x = "Life stage") +
  coord_fixed(1/2) +
  theme(axis.title.x = element_text(color = "black"),
        axis.title.y = element_text(angle = 90),
        axis.text.y = element_text(angle = 0),
        axis.line.y = element_line(color = "black"),
        axis.ticks.y = element_line(color = "black"),
        axis.ticks.length.y = unit(2, "mm"))

2.2.1 Dendrogram to visualize hierarchical clustering

Clustering performed on filtered and normalized abundance data using the “complete” method.


# Principal component analysis (PCA) -------------
# this also works on a data matrix, not a data frame
# t() puts samples in rows; scale.=F performs PCA on centered (not
# variance-scaled) data; retx=T stores the per-sample scores in $x.
pca.res <- prcomp(t(log2.cpm.filtered.norm), scale.=F, retx=T)
summary(pca.res) # Prints variance summary for all principal components.
## Importance of components:
##                             PC1     PC2      PC3      PC4      PC5      PC6
## Standard deviation     151.1711 71.3233 52.77388 40.28261 33.84034 27.82526
## Proportion of Variance   0.5582  0.1242  0.06803  0.03963  0.02797  0.01891
## Cumulative Proportion    0.5582  0.6824  0.75045  0.79008  0.81805  0.83696
##                             PC7     PC8      PC9     PC10     PC11     PC12
## Standard deviation     24.18156 20.1370 17.76894 17.36531 16.41680 16.27187
## Proportion of Variance  0.01428  0.0099  0.00771  0.00737  0.00658  0.00647
## Cumulative Proportion   0.85125  0.8611  0.86886  0.87623  0.88281  0.88928
##                            PC13     PC14    PC15     PC16    PC17     PC18
## Standard deviation     15.54952 14.36360 14.1614 13.75806 13.2620 13.12308
## Proportion of Variance  0.00591  0.00504  0.0049  0.00462  0.0043  0.00421
## Cumulative Proportion   0.89518  0.90022  0.9051  0.90974  0.9140  0.91825
##                           PC19     PC20     PC21     PC22    PC23    PC24
## Standard deviation     12.4693 12.31984 11.98556 11.76961 11.4539 10.8951
## Proportion of Variance  0.0038  0.00371  0.00351  0.00338  0.0032  0.0029
## Cumulative Proportion   0.9220  0.92575  0.92926  0.93264  0.9358  0.9387
##                            PC25     PC26     PC27     PC28    PC29    PC30
## Standard deviation     10.77788 10.47822 10.40139 10.22759 9.98258 9.94537
## Proportion of Variance  0.00284  0.00268  0.00264  0.00255 0.00243 0.00242
## Cumulative Proportion   0.94158  0.94427  0.94691  0.94946 0.95190 0.95431
##                           PC31    PC32    PC33    PC34    PC35    PC36    PC37
## Standard deviation     9.65700 9.54413 9.39232 9.37475 9.24814 9.17675 9.08021
## Proportion of Variance 0.00228 0.00222 0.00215 0.00215 0.00209 0.00206 0.00201
## Cumulative Proportion  0.95659 0.95882 0.96097 0.96312 0.96521 0.96726 0.96928
##                           PC38    PC39    PC40    PC41    PC42   PC43    PC44
## Standard deviation     8.90145 8.83455 8.47314 8.42971 8.22338 8.0961 8.00872
## Proportion of Variance 0.00194 0.00191 0.00175 0.00174 0.00165 0.0016 0.00157
## Cumulative Proportion  0.97121 0.97312 0.97487 0.97661 0.97826 0.9799 0.98143
##                           PC45    PC46    PC47    PC48    PC49    PC50    PC51
## Standard deviation     7.80177 7.74659 7.68201 7.50380 7.42730 7.22874 7.10471
## Proportion of Variance 0.00149 0.00147 0.00144 0.00138 0.00135 0.00128 0.00123
## Cumulative Proportion  0.98291 0.98438 0.98582 0.98720 0.98854 0.98982 0.99105
##                          PC52   PC53   PC54   PC55    PC56    PC57    PC58
## Standard deviation     7.0069 6.7095 6.3853 6.0636 5.94847 5.63466 5.30005
## Proportion of Variance 0.0012 0.0011 0.0010 0.0009 0.00086 0.00078 0.00069
## Cumulative Proportion  0.9922 0.9933 0.9943 0.9952 0.99611 0.99689 0.99757
##                           PC59    PC60    PC61    PC62      PC63
## Standard deviation     5.25704 5.11576 4.89976 4.64896 1.338e-13
## Proportion of Variance 0.00068 0.00064 0.00059 0.00053 0.000e+00
## Cumulative Proportion  0.99825 0.99889 0.99947 1.00000 1.000e+00

#pca.res$rotation # how much each gene contributes to each PC (the rotation / 'loadings' matrix)
#pca.res$x # per-sample coordinates in PC space (the 'scores')
#note that these have a magnitude and a direction (this is the basis for making a PCA plot)
## This generates a screeplot: a standard way to view eigenvalues for each PCA. Shows the proportion of variance accounted for by each PC. Plotting only the first 10 dimensions.
p2<-fviz_eig(pca.res,
             #barcolor = brewer.pal(8,"Pastel2")[8],
             #barfill = brewer.pal(8,"Pastel2")[8],
             linecolor = "black",
             main = "Scree plot: proportion of variance accounted for by each principal component", ggtheme = theme_bw()) 

2.2.2 Screeplot of PCA Eigenvalues

A scree plot is a standard way to view eigenvalues for each PCA. The plot shows the proportion of variance accounted for by each PC.

p2


pc.var <- pca.res$sdev^2  # sdev^2 recovers the eigenvalues from the PCA result
pc.per <- round(pc.var / sum(pc.var) * 100, 1)  # percent variance explained by each PC

# Visualize the PCA result ------------------
# Plot any two PCs against each other using the per-sample scores ($x).
pca.res.df <- as_tibble(pca.res$x)

# Plotting PC1 and PC2; color/fill encode life stage, shape encodes experiment.
p3 <- ggplot(pca.res.df) +
  aes(x = PC1, y = PC2, label = targets$stage,
      fill = targets$stage,
      color = targets$stage,
      shape = targets$block
  ) +
  geom_point(size = 4) +
  xlab(paste0("PC1 (", pc.per[1], "%", ")")) +
  ylab(paste0("PC2 (", pc.per[2], "%", ")")) +
  # BUG FIX: labs() has no `sub` argument, so the note below was silently
  # dropped from the rendered plot; `subtitle` is the correct name.
  labs(title = "Principal Components Analysis",
       subtitle = "Note: analysis is blind to sample identity.",
       shape = "Experiment",
       fill = "Time point",
       color = "Time point") +
  scale_x_continuous(expand = c(.3, .3)) +
  scale_y_continuous(expand = c(.3, .3)) +
  coord_fixed() +
  theme_bw() +
  theme(text = element_text(size = 10),
        title = element_text(size = 10))

2.2.3 PCA Plot

Plot of the samples in PCA space. Fill color indicates life stage.


# Create a PCA 'small multiples' chart ----
# Keep the first 6 PCs alongside sample and group labels.
pca.res.df <- pca.res$x[,1:6] %>% 
  as_tibble() %>%
  add_column(sample = targets$sample,
             group = targets$stage)

# Long format over PC1-PC3 only (PC4-PC6 are retained above but unused here).
# Note: the value column is named "loadings" but holds the per-sample PC
# scores from pca.res$x.
pca.pivot <- pivot_longer(pca.res.df, # dataframe to be pivoted
                          cols = PC1:PC3, # column names to be stored as a SINGLE variable
                          names_to = "PC", # name of that new variable (column)
                          values_to = "loadings") # name of new variable (column) storing all the values (data)
PC1<-subset(pca.pivot, PC == "PC1")
PC2 <-subset(pca.pivot, PC == "PC2")
PC3 <- subset(pca.pivot, PC == "PC3")
#PC4 <- subset(pca.pivot, PC == "PC4")

# New facet label names for PCs (each annotated with % variance explained)
PC.labs <- c(paste0("PC1 (",pc.per[1],"%",")"),
             paste0("PC2 (",pc.per[2],"%",")"),
             paste0("PC3 (",pc.per[3],"%",")")
)
names(PC.labs) <- c("PC1", "PC2", "PC3")

# Faceted bar chart: one bar per sample per PC.
# NOTE(review): the first geom_bar() draws all facets without a fill and is
# then overplotted by the three per-PC layers that add the group fill --
# confirm the base layer is intentional.
p6<-ggplot(pca.pivot) +
  aes(x=sample, y=loadings) + # you could iteratively 'paint' different covariates onto this plot using the 'fill' aes
  geom_bar(stat="identity") +
  facet_wrap(~PC, labeller = labeller(PC = PC.labs)) +
  geom_bar(data = PC1, stat = "identity", aes(fill = group)) +
  geom_bar(data = PC2, stat = "identity", aes(fill = group)) +
  geom_bar(data = PC3, stat = "identity", aes(fill = group)) +
  labs(title="PCA 'small multiples' plot",
       fill = "Life Stage",
       caption=paste0("produced on ", Sys.time())) +
  scale_x_discrete(limits = targets$sample, labels = targets$group) +
  theme_bw() +
  theme(text = element_text(size = 10),
        title = element_text(size = 10)) +
  coord_flip()

2.2.4 PCA “Small Multiples” Plot